"""Convert a WMT SGM file into plain text, writing one segment per line."""
import argparse
import io
import re

# Matches one ``<seg id="N">...</seg>`` element; group 1 is the segment text.
# ``.`` does not match a newline, so only segments that open and close on the
# same line are recognised.
_SEG_PATTERN = re.compile(r"<seg id=\"\d+\">(.+?)</seg>")


def main(argv=None):
    """Extract the text of every ``<seg>`` element from an SGM file.

    Each segment's text is echoed to stdout and written to the output file,
    one segment per line, in the order the segments appear in the input.
    Both files are read and written as UTF-8.

    Args:
        argv: Command-line arguments ``[in_fn, out_fn]``. When ``None``,
            argparse falls back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("in_fn", help="input .sgm file (UTF-8)")
    parser.add_argument("out_fn", help="output plain-text file (UTF-8)")
    args = parser.parse_args(argv)

    # Read the whole input up front and close it before the output is opened,
    # so the input handle is never leaked.
    with io.open(args.in_fn, encoding="utf-8") as in_f:
        text = in_f.read()

    # The context manager guarantees the output is flushed and closed even if
    # printing or writing fails part-way through.
    with io.open(args.out_fn, "w", encoding="utf-8") as out_f:
        for match in _SEG_PATTERN.finditer(text):
            segment = match.group(1)
            print(segment)
            out_f.write(segment + "\n")


if __name__ == "__main__":
    main()
